import io
import zipfile
from pathlib import Path

import pandas as pd


def read_first_dta_from_zip(zip_path: Path) -> pd.DataFrame:
    """Load a Stata ``.dta`` member of a zip archive into a DataFrame.

    Member names are matched case-insensitively on the ``.dta`` suffix. When
    the archive holds several matches, the member with the largest
    uncompressed size is read; on a size tie the earliest-listed one wins.

    Args:
        zip_path: Location of the zip archive.

    Returns:
        The parsed Stata data, read with ``convert_categoricals=False``.

    Raises:
        FileNotFoundError: If the archive contains no ``.dta`` member.
    """
    with zipfile.ZipFile(zip_path, "r") as archive:
        stata_members = [
            member for member in archive.namelist() if member.lower().endswith(".dta")
        ]
        if not stata_members:
            raise FileNotFoundError(f"No .dta in {zip_path}")
        # max() yields the first maximal element, so ties resolve to the
        # earliest archive entry.
        chosen = max(stata_members, key=lambda member: archive.getinfo(member).file_size)
        payload = archive.read(chosen)
    # The archive is closed at this point; parse from the in-memory copy.
    buffer = io.BytesIO(payload)
    return pd.read_stata(buffer, convert_categoricals=False)


def _natural_sort_key(name: str) -> list:
    """Return a sort key that compares digit runs in *name* numerically."""
    import re  # local import keeps this helper self-contained

    # With a capturing group, re.split alternates text / digits / text / ...,
    # so odd positions are always digit runs and same-position elements of two
    # keys always share a type (no str-vs-int comparison errors).
    parts = re.split(r"(\d+)", name)
    return [int(part) if idx % 2 else part.lower() for idx, part in enumerate(parts)]


def _round_key(value: object) -> str:
    """Normalise a coverage ``round`` cell to a plain string such as ``"8"``."""
    text = str(value).strip()
    try:
        number = float(text)
    except ValueError:
        return text
    # A float-typed column renders 8 as "8.0"; collapse whole numbers to "8".
    return str(int(number)) if number.is_integer() else text


def main() -> None:
    """Build the minimal ESS rounds 8-11 extract and its country list.

    Paths are resolved relative to the parent of the directory containing this
    file. The function reads ``ess_data/integrated_files_coverage.csv`` and,
    for each of rounds 8, 9, 10 and 11, one
    ``ess_data/integrated_files/ESS<round>_integrated_*.zip`` archive. From
    each archive it keeps the wanted columns that exist, adds ``ess_round``
    and (when a coverage row for the round exists) the fieldwork/year columns,
    then writes ``outputs/ess_r8_r11_min.parquet`` and
    ``outputs/ess_r8_r11_countries.txt``.

    Raises:
        FileNotFoundError: If a round has no integrated zip left after
            excluding file names that contain "self".
    """
    root = Path(__file__).resolve().parents[1]
    ess = root / "ess_data"
    out_dir = root / "outputs"
    out_dir.mkdir(parents=True, exist_ok=True)

    coverage = pd.read_csv(ess / "integrated_files_coverage.csv")
    coverage = coverage[coverage["datafile_title"].str.startswith("ESS", na=False)]
    cov_by_round = {}
    for _, row in coverage.iterrows():
        # First coverage row per round wins. The key is normalised because the
        # column may be parsed as float (e.g. "8.0"), which would never match
        # the string round labels used below.
        cov_by_round.setdefault(_round_key(row["round"]), row)

    rounds = ["8", "9", "10", "11"]
    zips = []
    for round_s in rounds:
        # Drop self-completion files *before* the emptiness check so a round
        # with only such files reports a clear error instead of an IndexError.
        candidates = [
            p
            for p in (ess / "integrated_files").glob(f"ESS{round_s}_integrated_*.zip")
            if "self" not in p.name.lower()
        ]
        if not candidates:
            raise FileNotFoundError(f"Missing zip for round {round_s}")
        # Prefer the highest edition for that round. Digit runs are compared
        # numerically so that e.g. e3.10 ranks above e3.3; the raw name is a
        # deterministic tie-breaker.
        newest = max(candidates, key=lambda p: (_natural_sort_key(p.name), p.name))
        # Keep the round next to its path rather than re-parsing the file name.
        zips.append((round_s, newest))

    columns_wanted = [
        # IDs
        "idno",
        "cntry",
        "regunit",
        "region",
        # weights
        "dweight",
        "pspwght",
        "pweight",
        "anweight",
        # internet / news
        "netusoft",
        "netustm",
        "nwspol",
        "pstplonl",
        # mechanisms / attitudes (available in ESS9-11; used in extended mechanism + stress tests)
        "polintr",
        "ppltrst",
        "stfdem",
        "trstplt",
        "trstprl",
        "trstprt",
        "actrolga",
        "prtdgcl",
        "inprdsc",
        # participation
        "vote",
        "contplt",
        "badge",
        "sgnptit",
        "pbldmn",
        "pbldmna",
        "bctprd",
        "wrkprty",
        "wrkorg",
        # controls / inequality dimensions
        "eisced",
        "eduyrs",
        "agea",
        "gndr",
        "hinctnta",
        "brncntr",
        "domicil",
        "uempla",
    ]

    frames = []
    for round_s, zip_path in zips:
        df = read_first_dta_from_zip(zip_path)
        # Not every round carries every variable; keep only those present.
        keep = [c for c in columns_wanted if c in df.columns]
        df = df[keep].copy()
        df.insert(0, "ess_round", int(round_s))

        cov = cov_by_round.get(round_s)
        if cov is None:
            print(f"Warning: no coverage metadata for round {round_s}; fieldwork/year columns not set")
        else:
            start_year = int(cov.get("start_year"))
            end_year = int(cov.get("end_year"))
            df["fieldwork_start"] = cov.get("fieldwork_start")
            df["fieldwork_end"] = cov.get("fieldwork_end")
            df["start_year"] = start_year
            df["end_year"] = end_year
            # Computed from the scalars so an empty frame cannot break it.
            # NOTE(review): round() sends exact halves to the even year
            # (2016.5 -> 2016, 2023.5 -> 2024); kept to preserve existing
            # output -- confirm this is the intended midpoint rule.
            df["survey_year"] = int(round((start_year + end_year) / 2))
        frames.append(df)

        print(f"Loaded round {round_s}: rows={len(df):,} cols={len(df.columns)} from {zip_path.name}")

    out = pd.concat(frames, ignore_index=True)
    out_path = out_dir / "ess_r8_r11_min.parquet"
    out.to_parquet(out_path, index=False)

    # Also write a small country list for external data fetch.
    countries = sorted({str(x) for x in out["cntry"].dropna().unique()})
    (out_dir / "ess_r8_r11_countries.txt").write_text("\n".join(countries) + "\n", encoding="utf-8")

    print(f"Wrote: {out_path}")
    print(f"Wrote: {out_dir / 'ess_r8_r11_countries.txt'} (n={len(countries)})")

# Run the extract only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
